library(tidyverse)
library(janitor)
library(GGally)

# Question 1

# Read the raw house data from disk and print it for a first look.
kc_house <- read_csv("data/kc_house_data.csv")
kc_house

# Drop the identifier, date, location and the listed square-footage columns;
# every other column is carried forward into house_clean.
house_clean <- select(
  kc_house,
  -c(
    id, date, zipcode, lat, long,
    sqft_living, sqft_lot, sqft_living15, sqft_lot15
  )
)
house_clean

# Count the missing values in each remaining column.
summarise(house_clean, across(everything(), ~ sum(is.na(.x))))

Recode grade from its numeric 1 to 13 scale into 5 categories.

# Collapse the numeric grade into five labelled bands.
# Every band is matched by an explicit condition, so a grade that matches none
# of them (missing, zero or negative, or a non-integer between 6 and 7) is
# left as NA instead of being silently labelled "high quality" by a catch-all
# branch.
# The result is a character column, so a later lm() on it treats the
# alphabetically first label ("above average") as the reference level.
house_clean <- house_clean %>%
  mutate(grade = case_when(
    grade > 0 & grade <= 3 ~ "falls short",
    grade > 3 & grade <= 6 ~ "below average",
    grade == 7 ~ "average",
    grade > 7 & grade <= 10 ~ "above average",
    grade > 10 ~ "high quality"
  ))

Convert waterfront to a logical variable.

# Turn waterfront into a logical flag: 0 becomes FALSE, any other value TRUE,
# and a missing value stays NA.
house_clean <- mutate(house_clean, waterfront = waterfront != 0)
house_clean

Convert yr_renovated to a logical variable (renovated) and drop the original column.

# Add a logical renovated flag (FALSE when yr_renovated is 0, TRUE for any
# other value, NA when missing) and remove yr_renovated in the same step;
# assigning NULL inside mutate() drops the column.
house_clean <- mutate(
  house_clean,
  renovated = yr_renovated != 0,
  yr_renovated = NULL
)
house_clean

# Question 2

# Regress price on every other column of house_clean.
model <- lm(formula = price ~ ., data = house_clean)

# Check the fitted model for aliased terms; the output below lists only the
# model formula.
alias(model)
## Model :
## price ~ bedrooms + bathrooms + floors + waterfront + view + condition + 
##     grade + sqft_above + sqft_basement + yr_built + renovated

# Coefficient table and fit statistics for the full model.
summary(model)
## 
## Call:
## lm(formula = price ~ ., data = house_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1614080  -112764    -9605    90843  3987682 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.214e+06  1.467e+05  42.358  < 2e-16 ***
## bedrooms           -3.980e+04  2.085e+03 -19.091  < 2e-16 ***
## bathrooms           4.821e+04  3.588e+03  13.436  < 2e-16 ***
## floors              4.034e+04  3.851e+03  10.474  < 2e-16 ***
## waterfrontTRUE      5.400e+05  1.910e+04  28.279  < 2e-16 ***
## view                5.048e+04  2.302e+03  21.926  < 2e-16 ***
## condition           2.084e+04  2.562e+03   8.134 4.39e-16 ***
## gradeaverage       -1.028e+05  4.057e+03 -25.338  < 2e-16 ***
## gradebelow average -1.615e+05  6.566e+03 -24.601  < 2e-16 ***
## gradefalls short   -1.033e+05  1.110e+05  -0.931  0.35203    
## gradehigh quality   4.869e+05  1.152e+04  42.280  < 2e-16 ***
## sqft_above          2.088e+02  3.249e+00  64.273  < 2e-16 ***
## sqft_basement       1.991e+02  4.638e+00  42.934  < 2e-16 ***
## yr_built           -3.132e+03  7.365e+01 -42.522  < 2e-16 ***
## renovatedTRUE       2.808e+04  8.016e+03   3.503  0.00046 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 221500 on 21598 degrees of freedom
## Multiple R-squared:  0.6363, Adjusted R-squared:  0.636 
## F-statistic:  2699 on 14 and 21598 DF,  p-value: < 2.2e-16

# Question 3

# Split house_clean by column type so that each group gets its own pairs plot.
# select(where(...)) replaces the superseded select_if().
houses_tidy_numeric <- house_clean %>%
  select(where(is.numeric))

# price is numeric, so it is selected explicitly after the non-numeric columns;
# it ends up as the last column, next to the predictors it is plotted against.
houses_tidy_nonnumeric <- house_clean %>%
  select(where(~ !is.numeric(.x)), price)

# Pairwise plots for the numeric columns.
ggpairs(houses_tidy_numeric)

# Pairwise plots for the non-numeric columns together with price.
ggpairs(houses_tidy_nonnumeric)

Now we will build regression models of price, one for each of the four main effects.

# Single-predictor model: price explained by the renovated flag alone.
mod1a <- lm(formula = price ~ renovated, data = house_clean)
summary(mod1a)
## 
## Call:
## lm(formula = price ~ renovated, data = house_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -650379 -215361  -85361  104639 6939621 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     530361       2532  209.51   <2e-16 ***
## renovatedTRUE   230018      12310   18.69   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 364200 on 21611 degrees of freedom
## Multiple R-squared:  0.0159, Adjusted R-squared:  0.01585 
## F-statistic: 349.2 on 1 and 21611 DF,  p-value: < 2.2e-16
# Single-predictor model: price explained by the grade category alone.
mod2a <- lm(formula = price ~ grade, data = house_clean)
summary(mod2a)
## 
## Call:
## lm(formula = price ~ grade, data = house_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1258635  -147590   -43590    96410  6021365 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          665392       2912 228.466  < 2e-16 ***
## gradeaverage        -262802       4214 -62.370  < 2e-16 ***
## gradebelow average  -370168       6674 -55.462  < 2e-16 ***
## gradefalls short    -475642     144313  -3.296 0.000983 ***
## gradehigh quality   1013243      13205  76.734  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 288600 on 21608 degrees of freedom
## Multiple R-squared:  0.3823, Adjusted R-squared:  0.3822 
## F-statistic:  3343 on 4 and 21608 DF,  p-value: < 2.2e-16
# Single-predictor model: price explained by the number of bedrooms alone.
mod3a <- lm(formula = price ~ bedrooms, data = house_clean)
summary(mod3a)
## 
## Call:
## lm(formula = price ~ bedrooms, data = house_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3506435  -203235   -66667   105049  6839901 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   129802       8932   14.53   <2e-16 ***
## bedrooms      121716       2554   47.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 349200 on 21611 degrees of freedom
## Multiple R-squared:  0.09508,    Adjusted R-squared:  0.09504 
## F-statistic:  2271 on 1 and 21611 DF,  p-value: < 2.2e-16

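Bedrooms alone explains only about 9.5% of the variance in price (R-squared = 0.095, not 95%); of the four single-predictor models, grade has the highest R-squared (0.38).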

# Single-predictor model: price explained by the number of floors alone.
mod4a <- lm(formula = price ~ floors, data = house_clean)
summary(mod4a)
## 
## Call:
## lm(formula = price ~ floors, data = house_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -597965 -203837  -73787  103213 6984329 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   279198       7102   39.31   <2e-16 ***
## floors        174589       4470   39.06   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 354800 on 21611 degrees of freedom
## Multiple R-squared:  0.06594,    Adjusted R-squared:  0.0659 
## F-statistic:  1526 on 1 and 21611 DF,  p-value: < 2.2e-16